# BRisk

This code examines some aspects of the real data for comparison with the simulated data. It uses the revised dataset that has no imputation.

# This code is modified to be used locally or on a cluster. 
#seedID = 1;

# Toggle between a local run and a cluster run:
local <- TRUE

if (local) {
  # Local run: 8 parallel workers, fixed seed for reproducibility.
  #setwd('~/Dropbox/QualityControlImpactsFMRI')
  save.input.data <- FALSE
  getOption("mc.cores")
  options(mc.cores = 8)
  seed <- 1
} else {
  # Cluster run: one worker per task; seedID is supplied by the scheduler
  # (see the commented-out assignment near the top of the file).
  setwd('~/risk_share/QualityControlImpacts')
  save.input.data <- FALSE
  options(mc.cores = 1)
  seed <- seedID
}

# Put the cluster's personal R library ahead of the system libraries.
# On a local machine the nonexistent path is silently dropped by .libPaths().
a = .libPaths()
.libPaths(c('/home/benjamin.risk/Rlibs',a))

.libPaths()
## [1] "/Library/Frameworks/R.framework/Versions/4.1/Resources/library"
getOption("mc.cores")
## [1] 8
# L'Ecuyer-CMRG generator: provides reproducible, independent random streams
# for the parallel (mc.cores > 1) SuperLearner fits below.
set.seed(seed, "L'Ecuyer-CMRG")
## drtmle: TMLE with doubly robust inference
## Version: 1.1.0
## Loading required package: nnls
## Loading required package: gam
## Loading required package: splines
## Loading required package: foreach
## Loaded gam 1.20
## Super Learner
## Version: 2.0-28
## Package created on 2021-05-04
## Loading required package: Formula
## Loading required package: plotmo
## Loading required package: plotrix
## Loading required package: TeachingDemos
## Warning: package 'nloptr' was built under R version 4.1.2
## Warning: package 'tidyr' was built under R version 4.1.2
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## Loaded glmnet 4.1-3
# Master dataset: head motion measures plus partial correlations for 30 ICs
# (revised dataset with no imputation).
dat=read.csv('../Data/Master_HeadMotion_wholeGroup_partialCorrelations_ic30_20210803.csv',header=T)

# sort by ID: this simplifies the later indexing, as a merge with 
# propensities sorts by ID:
dat = dat[order(dat$ID),]

# Sample composition: diagnosis counts for the whole group.
table(dat$PrimaryDiagnosis)
## 
## Autism   None 
##    173    372
table(dat$Race,dat$Sex,useNA='always') # Used in Protection of Human Subjects table in grant
##                   
##                      F   M <NA>
##   African American  14  37    0
##   Asian             10  21    0
##   Biracial          20  41    0
##   Caucasian         95 302    0
##   Hispanic           0   1    0
##   Other              0   1    0
##   Unknown            0   2    0
##   <NA>               0   1    0
# note: create a "caucasian/other" race variable that allocates the four children without race to caucasian
# NOTE(review): only Hispanic/Other/Unknown (4 children) are recoded; the one
# child with Race == NA stays NA in Race2 and is later dropped by
# complete.cases() -- confirm this is intended.
dat$Race2 = dat$Race
dat$Race2[dat$Race2%in%c('Hispanic','Other','Unknown')]='Caucasian'
table(dat$Race2)
## 
## African American            Asian         Biracial        Caucasian 
##               51               31               61              401
# Add ADHD variable: binary indicator of any secondary ADHD diagnosis.
dat$ADHD_Secondary = ifelse(dat$ADHD.Subtype=='No dx',0,1)
table(dat$ADHD_Secondary,dat$ADHD.Subtype,useNA='always') # there should be no strange categories. 2/3/2021: Looks good
##       
##        Combined Hyperactive/Impulsive Inattentive No dx <NA>
##   0           0                     0           0   425    0
##   1          69                     7          44     0    0
##   <NA>        0                     0           0     0    0
# Diagnosis counts among scans passing the lenient (KKI) motion criteria:
table(dat$PrimaryDiagnosis[dat$KKI_criteria=='Pass'])
## 
## Autism   None 
##    114    308
# ...and among scans passing the stricter Ciric length criterion:
table(dat$PrimaryDiagnosis[dat$Ciric_length=='Pass'])
## 
## Autism   None 
##     29    151
# subset to signal components:
#ic_class = read_excel('./DeconfoundedFMRI/componentLabels_pca85_ica30.xlsx')
ic_class = read_excel('../componentLabels_pca85_ica30.xlsx')

# Build the names of edges involving at least one artifactual (non-signal) IC.
# An edge column "icA.icB" is excluded whenever A or B is an artifact, so we
# enumerate both orderings (any.artifact and artifact.any). The original
# nested loops grew badEdges with c() on every iteration; outer() forms all
# pairwise combinations in one vectorized call. Order does not matter because
# badEdges is only used for set membership (%in%) below.
artifacts = which(ic_class$signal == 0)
badICs = paste0('ic', artifacts)
allICs = paste0('ic', seq_len(nrow(ic_class)))
badEdges = c(outer(allICs, badICs, paste, sep = '.'),
             outer(badICs, allICs, paste, sep = '.'))

# Drop every column named after an artifact-containing edge:
dat2 = dat[,!names(dat)%in%badEdges]

# Variables containing Fisher z partial correlations 

# Column range holding the retained (signal-signal) edges. This relies on
# the edge columns being contiguous in dat2, with ic1.ic2 first and
# ic29.ic30 last after the artifact edges were dropped.
startEdgeidx = which(names(dat2)=='ic1.ic2')
endEdgeidx = which(names(dat2)=='ic29.ic30')
names(dat2)[startEdgeidx:endEdgeidx] #18*17/2 = nEdges
##   [1] "ic1.ic2"   "ic1.ic4"   "ic1.ic8"   "ic1.ic13"  "ic1.ic14"  "ic1.ic15" 
##   [7] "ic1.ic17"  "ic1.ic19"  "ic1.ic21"  "ic1.ic22"  "ic1.ic24"  "ic1.ic25" 
##  [13] "ic1.ic26"  "ic1.ic27"  "ic1.ic28"  "ic1.ic29"  "ic1.ic30"  "ic2.ic4"  
##  [19] "ic2.ic8"   "ic2.ic13"  "ic2.ic14"  "ic2.ic15"  "ic2.ic17"  "ic2.ic19" 
##  [25] "ic2.ic21"  "ic2.ic22"  "ic2.ic24"  "ic2.ic25"  "ic2.ic26"  "ic2.ic27" 
##  [31] "ic2.ic28"  "ic2.ic29"  "ic2.ic30"  "ic4.ic8"   "ic4.ic13"  "ic4.ic14" 
##  [37] "ic4.ic15"  "ic4.ic17"  "ic4.ic19"  "ic4.ic21"  "ic4.ic22"  "ic4.ic24" 
##  [43] "ic4.ic25"  "ic4.ic26"  "ic4.ic27"  "ic4.ic28"  "ic4.ic29"  "ic4.ic30" 
##  [49] "ic8.ic13"  "ic8.ic14"  "ic8.ic15"  "ic8.ic17"  "ic8.ic19"  "ic8.ic21" 
##  [55] "ic8.ic22"  "ic8.ic24"  "ic8.ic25"  "ic8.ic26"  "ic8.ic27"  "ic8.ic28" 
##  [61] "ic8.ic29"  "ic8.ic30"  "ic13.ic14" "ic13.ic15" "ic13.ic17" "ic13.ic19"
##  [67] "ic13.ic21" "ic13.ic22" "ic13.ic24" "ic13.ic25" "ic13.ic26" "ic13.ic27"
##  [73] "ic13.ic28" "ic13.ic29" "ic13.ic30" "ic14.ic15" "ic14.ic17" "ic14.ic19"
##  [79] "ic14.ic21" "ic14.ic22" "ic14.ic24" "ic14.ic25" "ic14.ic26" "ic14.ic27"
##  [85] "ic14.ic28" "ic14.ic29" "ic14.ic30" "ic15.ic17" "ic15.ic19" "ic15.ic21"
##  [91] "ic15.ic22" "ic15.ic24" "ic15.ic25" "ic15.ic26" "ic15.ic27" "ic15.ic28"
##  [97] "ic15.ic29" "ic15.ic30" "ic17.ic19" "ic17.ic21" "ic17.ic22" "ic17.ic24"
## [103] "ic17.ic25" "ic17.ic26" "ic17.ic27" "ic17.ic28" "ic17.ic29" "ic17.ic30"
## [109] "ic19.ic21" "ic19.ic22" "ic19.ic24" "ic19.ic25" "ic19.ic26" "ic19.ic27"
## [115] "ic19.ic28" "ic19.ic29" "ic19.ic30" "ic21.ic22" "ic21.ic24" "ic21.ic25"
## [121] "ic21.ic26" "ic21.ic27" "ic21.ic28" "ic21.ic29" "ic21.ic30" "ic22.ic24"
## [127] "ic22.ic25" "ic22.ic26" "ic22.ic27" "ic22.ic28" "ic22.ic29" "ic22.ic30"
## [133] "ic24.ic25" "ic24.ic26" "ic24.ic27" "ic24.ic28" "ic24.ic29" "ic24.ic30"
## [139] "ic25.ic26" "ic25.ic27" "ic25.ic28" "ic25.ic29" "ic25.ic30" "ic26.ic27"
## [145] "ic26.ic28" "ic26.ic29" "ic26.ic30" "ic27.ic28" "ic27.ic29" "ic27.ic30"
## [151] "ic28.ic29" "ic28.ic30" "ic29.ic30"
# Number of retained edges (18 signal ICs -> 18*17/2 = 153):
(nEdges = endEdgeidx - startEdgeidx+1)
## [1] 153
# note: check whether all "PASS" have correlations. Should be all FALSE:
table(is.na(dat2[,startEdgeidx]) & dat2$KKI_criteria=='Pass')
## 
## FALSE 
##   545
# No missing. Cool.
# No missing. Cool.

# Motion variables:
# MeanFramewiseDisplacement.KKI
# MaxFramewiseDisplacement.KKI
# FramesWithFDLessThanOrEqualTo250microns
# Here, we control for sex and motion effects. We do not control for age, but 
# age is included in the propensity and outcome models, resulting
# in the age distribution for each group (including fails), which is 
# approximately equal between groups. 

# Create a variable r.ic1.ic2 for each signal pairing that 
# contains the residuals from the three motion variables.
# completeCases uses ic1.ic2 as the representative edge; the audit inside the
# loop below verifies the same missingness pattern holds for every edge,
# so lm()'s row-dropping matches this index for all i.
temp = dat2[,c('PrimaryDiagnosis','MeanFramewiseDisplacement.KKI','MaxFramewiseDisplacement.KKI',"FramesWithFDLessThanOrEqualTo250microns","Sex","Race2","SES.Family","ic1.ic2")]
completeCases = complete.cases(temp)

#nrow(temp[is.na(temp$ic1.ic2) ,])

# Collected t-statistics: from the full lm fit vs. a naive two-sample t-test
# on the motion-adjusted values (sanity check that the two roughly agree).
t.values.lm=NULL
t.values.naive=NULL

lm.variables=c('MeanFramewiseDisplacement.KKI')
# NOTE: for better interpretation of the plot of mean changes, center all variables:
for (i in c(startEdgeidx:endEdgeidx)) {
  # Regress each edge on centered motion covariates, sex, race, SES, and
  # diagnosis:
  model.temp = lm(dat2[,i]~scale(dat2$MeanFramewiseDisplacement.KKI,center = TRUE, scale=FALSE)+scale(dat2$MaxFramewiseDisplacement.KKI,center=TRUE,scale=FALSE)+scale(dat2$FramesWithFDLessThanOrEqualTo250microns,center=TRUE,scale=FALSE)+dat2$Sex+dat2$Race2+scale(dat2$SES.Family,center=TRUE,scale=FALSE)+dat2$PrimaryDiagnosis)
  # create the motion-adjusted fconn data:
  # residuals + intercept + diagnosis effect removes the nuisance covariates
  # while preserving the overall level and the ASD/TD group difference.
  dat2[completeCases,paste0('r.',names(dat2)[i])]=residuals(model.temp)+coef(model.temp)["(Intercept)"]+coef(model.temp)["dat2$PrimaryDiagnosisNone"]*(dat2$PrimaryDiagnosis[completeCases]=='None')

    # Audits:
    # check ordering is same: residuals + fitted reconstructs the observed
    # edge values, so any mismatch means completeCases is out of sync.
    trash.audit = residuals(model.temp)+fitted(model.temp)
    trash = abs(dat2[completeCases,i]-trash.audit)<1e-16
    if(!all(trash)) stop('Ordering mismatch -- check variables in complete cases')     # check that t-statistic on residuals is approximately equal to t-statistic from lm:
    t.values.lm = c(t.values.lm,coef(summary(model.temp))['dat2$PrimaryDiagnosisNone',3])
    t.values.naive =   c(t.values.naive,t.test(dat2[completeCases & dat2$PrimaryDiagnosis=='None',paste0('r.',names(dat2)[i])],dat2[completeCases & dat2$PrimaryDiagnosis=='Autism',paste0('r.',names(dat2)[i])])$statistic)
}

# Set ADOS to zero for all typically-developing (TD) participants:
dat2$ADOS.Comparable.Total[dat2$PrimaryDiagnosis == 'None'] <- 0

# Candidate learners for the SuperLearner ensembles. gn is the propensity
# model and Qbar is the outcome model, also used for imputing PANESS:
my.SL.libs.gn <- c("SL.earth", "SL.glmnet", "SL.gam", "SL.glm", "SL.ranger",
                   "SL.step", "SL.step.interaction", "SL.xgboost", "SL.mean")
my.SL.libs.Qbar <- c("SL.earth", "SL.glmnet", "SL.gam", "SL.glm", "SL.ranger",
                     "SL.ridge", "SL.step", "SL.step.interaction", "SL.svm",
                     "SL.xgboost", "SL.mean")


#<---------------------------------

# Dataset for propensity model:
# predictors of passing the lenient QC criteria (KKI_criteria is the outcome).
gn.variables = c('KKI_criteria','PrimaryDiagnosis','ADHD_Secondary','AgeAtScan','handedness','CurrentlyOnStimulants','PANESS.TotalOverflowNotAccountingForAge','WISC.GAI','DuPaulHome.InattentionRaw','DuPaulHome.HyperactivityRaw','ADOS.Comparable.Total')

# these indices will be used in the outcome model and drtmle as well:
temp.data = dat2[,c(gn.variables)]

# note: include variables from the linear model here to define a consistent set of
# complete predictor cases for the propensity and outcome models
idx.all.cc = complete.cases(temp.data) & complete.cases(dat2[,c('Sex','SES.Family','Race2')])
temp.data = temp.data[idx.all.cc,]
#temp.data$AgeAtScanXdx = (temp.data$PrimaryDiagnosis=='Autism')*temp.data$AgeAtScan
#AgeAtScan
# Dummy-coded design matrix; [,-1] drops the intercept column. KKI_criteria
# appears only on the formula LHS, so it is excluded from the predictors.
gn.xmat = data.frame(model.matrix(KKI_criteria~.,data=temp.data)[,-1])

# Create a variable equal to one if the observation is used:
# NOTE: due to missing observations in variables in the initial linear
# model, need to include linear model variables in this: 
dat2$CompletePredictorCases = idx.all.cc
sum(dat2$CompletePredictorCases)
## [1] 485

For the response to reviewers (RtoR), describing the simulations section ---->

# Usability indicator under the lenient (KKI) criteria: 1 = Pass, 0 = Fail.
Delta.KKI = ifelse(temp.data$KKI_criteria=='Pass',1,0)
# Pass rate among ASD participants:
mean(Delta.KKI[gn.xmat$PrimaryDiagnosisNone==0])
## [1] 0.7153285
# Pass rate among TD participants:
mean(Delta.KKI[gn.xmat$PrimaryDiagnosisNone==1])
## [1] 0.8390805
# Smooth of usability on ADOS severity (logit link):
model.gam = mgcv::gam(Delta.KKI~s(ADOS.Comparable.Total),data=gn.xmat,family=binomial)
summary(model.gam)
## 
## Family: binomial 
## Link function: logit 
## 
## Formula:
## Delta.KKI ~ s(ADOS.Comparable.Total)
## 
## Parametric coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   1.4533     0.1182   12.29   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                            edf Ref.df Chi.sq p-value   
## s(ADOS.Comparable.Total) 1.513  1.836   15.3 0.00152 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.0302   Deviance explained = 2.86%
## UBRE = -0.028622  Scale est. = 1         n = 485
# Mean ADOS among ASD participants (full complete-case sample):
mean(gn.xmat$ADOS.Comparable.Total[gn.xmat$PrimaryDiagnosisNone==0])
## [1] 14.34307
# Central-difference setup: evaluate the fitted smooth just below and just
# above the mean ADOS to approximate the derivative numerically.
newd <- gn.xmat[1, ] # grab any row; we are going to change ADOS only
newd$ADOS.Comparable.Total <- mean(gn.xmat$ADOS.Comparable.Total[gn.xmat$PrimaryDiagnosisNone==0]) - 1e-05 # subtract some small number
y1 <- predict(model.gam, newd)
newd$ADOS.Comparable.Total <- mean(gn.xmat$ADOS.Comparable.Total[gn.xmat$PrimaryDiagnosisNone==0]) + 1e-05 # add some small number
y2 <- predict(model.gam, newd)

Slope at the mean ADOS in the real data (full sample):

# Central-difference slope (per ADOS unit, logit scale) at the mean ADOS:
(y2 - y1)/2e-05
##         467 
## -0.07740867

Effect at the mean ADOS in the real data (full sample):

# Slope times the mean ADOS: approximate total effect at the mean severity.
mean(gn.xmat$ADOS.Comparable.Total[gn.xmat$PrimaryDiagnosisNone==0])*(y2 - y1)/2e-05
##       467 
## -1.110278
# Repeat the central-difference slope at a more severe score, ADOS = 20:
newd <- gn.xmat[1, ] # grab any row; we are going to change ADOS only
newd$ADOS.Comparable.Total <- 20 - 1e-05 # subtract some small number
y1 <- predict(model.gam, newd)
newd$ADOS.Comparable.Total <- 20 + 1e-05 # add some small number
y2 <- predict(model.gam, newd)
(y2 - y1)/2e-05
##        467 
## -0.1000773
## <-----------------------------------------

# Fit the propensity model (probability of passing lenient QC) as a
# CV-weighted SuperLearner ensemble; mcSuperLearner parallelizes the learner
# fits over getOption("mc.cores"). Outer parentheses auto-print the fit.
(propensity.KKI = mcSuperLearner(Y = Delta.KKI, X = gn.xmat, family=binomial(link='logit'),SL.library = my.SL.libs.gn, cvControl = list(V = 10), method='method.CC_nloglik')) # 10-fold CV
## 
## Call:  
## mcSuperLearner(Y = Delta.KKI, X = gn.xmat, family = binomial(link = "logit"),  
##     SL.library = my.SL.libs.gn, method = "method.CC_nloglik", cvControl = list(V = 10)) 
## 
## 
## 
##                             Risk       Coef
## SL.earth_All            485.5807 0.13608363
## SL.glmnet_All           464.0483 0.53460726
## SL.gam_All              478.7078 0.00000000
## SL.glm_All              475.4646 0.00000000
## SL.ranger_All           475.5737 0.13471628
## SL.step_All             477.2543 0.00000000
## SL.step.interaction_All 548.0877 0.00000000
## SL.xgboost_All          545.0375 0.08515944
## SL.mean_All             481.6851 0.10943339
# Ensemble-predicted propensities for the complete-case sample:
propensities.SL = propensity.KKI$SL.predict
 
# merge propensities back to dataset:
prop.asdtd = data.frame('ID'=dat2$ID[idx.all.cc],'propensities.SL'=propensities.SL,Delta.KKI)
dat3 = merge(dat2,prop.asdtd,all = TRUE) # here, we keep all observations

# this merge changes the ordering of ID. Hence, new index vectors are created,
# and new xmat need to be made even when using the same variables in the 
# propensity and outcome models

# save datasets to be loaded for DRTMLE:
# NOTE: These datasets include the propensities, which change with each seed:
#if (save.input.data) {
#  save(file=paste0('./Data/DataWithPropensities_seed',seed,'.RData'),dat3)
#}

# check that the merge is correct: Delta.KKI should align perfectly with
# KKI_criteria (0 on the off diagonal).
table(dat3$KKI_criteria,dat3$Delta.KKI)
##       
##          0   1
##   Fail  95   0
##   Pass   0 390
# these should be all in agreement (0 on off diagonal)
all(idx.all.cc==!is.na(dat3$propensities.SL))
## [1] TRUE
# Delta should be correlated with propensities: 
cor(1*(dat3$KKI_criteria=='Pass'),dat3$propensities.SL,use='pairwise.complete.obs')
## [1] 0.6619975

Create the outcome regression datasets. NOTE: the same variables are used for the propensity and outcome models.

# Outcome model uses the same covariates as the propensity model:
Qn.variables =  gn.variables
#NOTE: KKI_criteria is not used in prediction, but is included as a trick to construct the design matrix

# complete cases pass defined by pass, no missing propensities (behavioral variables), and no missing fconn (driven by the initial linear model with imbalanced variables)
idx.pass.cc = dat3$KKI_criteria=='Pass' & !is.na(dat3$propensities.SL)
# use in naive estimates in for loop:
idx.pass.cc.asd = idx.pass.cc & dat3$PrimaryDiagnosis=='Autism'
idx.pass.cc.td = idx.pass.cc & dat3$PrimaryDiagnosis=='None'

# used in drtmle:
idx.all.cc.asd = idx.all.cc & dat3$PrimaryDiagnosis == 'Autism'
idx.all.cc.td = idx.all.cc & dat3$PrimaryDiagnosis == 'None'

# These datasets necessary to run SuperLearner without error
# (note: ordering of indices differs from the propensity models due to the merge with the propensities)
# Fit matrix: usable (Pass) complete cases only; [,-1] drops the intercept.
temp.data = dat3[idx.pass.cc,Qn.variables]
Qn.xmat.fit = data.frame(model.matrix(KKI_criteria~.,data=temp.data))[,-1]

# Prediction matrix: all complete cases (Pass and Fail); ID is kept so the
# ordering can be audited against the index vectors below.
temp.data = dat3[idx.all.cc,c('ID',Qn.variables)]
Qn.xmat.predict = data.frame(model.matrix(KKI_criteria~.,data=temp.data))[,-1]

# Separate ASD and TD datasets are necessary to obtain the DRTMLE estimates:
temp.data = Qn.xmat.predict[Qn.xmat.predict$PrimaryDiagnosisNone==0,]
# check that the order matches idx.all.cc.asd:
all(temp.data$ID==dat3$ID[idx.all.cc.asd])
## [1] TRUE
# Rebuild the design matrix; [,-c(1,2)] drops the intercept and ID columns.
# The numeric(nrow(...)) LHS is a dummy response so model.matrix accepts the
# formula without a real outcome variable.
Qn.xmat.predict.asd = data.frame(model.matrix(numeric(nrow(temp.data))~.,data=temp.data)[,-c(1,2)])

temp.data = Qn.xmat.predict[Qn.xmat.predict$PrimaryDiagnosisNone==1,]
# check that the order matches idx.all.cc.td:
all(temp.data$ID==dat3$ID[idx.all.cc.td])
## [1] TRUE
Qn.xmat.predict.td = data.frame(model.matrix(numeric(nrow(temp.data))~.,data=temp.data))[,-c(1,2)]


## FOR RtoR----->

par(mfrow=c(1,2))
hist(Qn.xmat.predict.asd$ADOS.Comparable.Total,main='a) ASD severity in real data (full sample)', col="#7FAE88")
abline(v=mean(Qn.xmat.predict.asd$ADOS.Comparable.Total),col='red', lwd=3)

Mean ADOS severity in the real data (full sample):

mean(Qn.xmat.predict.asd$ADOS.Comparable.Total)
## [1] 14.34307
# ADOS quartiles across usable and unusable scans (full ASD sample):
quantile(Qn.xmat.predict$ADOS.Comparable.Total[Qn.xmat.predict$PrimaryDiagnosisNone==0])
##   0%  25%  50%  75% 100% 
##    7   11   14   17   26
# Companion histogram restricted to scans that passed lenient motion QC:
hist(Qn.xmat.fit$ADOS.Comparable.Total[Qn.xmat.fit$PrimaryDiagnosisNone==0], main='B) ASD severity in real data (usable)', col="#5D60AA")
abline(v=mean(Qn.xmat.fit$ADOS.Comparable.Total[Qn.xmat.fit$PrimaryDiagnosisNone==0]),col='red', lwd=3) 

Mean ADOS severity in the real data (usable after lenient motion QC):

mean(Qn.xmat.fit$ADOS.Comparable.Total[Qn.xmat.fit$PrimaryDiagnosisNone==0])
## [1] 13.90816
# ADOS quartiles among usable scans only:
quantile(Qn.xmat.fit$ADOS.Comparable.Total[Qn.xmat.fit$PrimaryDiagnosisNone==0])
##    0%   25%   50%   75%  100% 
##  7.00 11.00 13.00 16.75 23.00

Examine the range of correlations for ADOS and the range of intercepts for ASD:

# Index of the first motion-adjusted edge column (the r.* variables created
# earlier); note this reuses/overwrites startEdgeidx for dat3.
startEdgeidx=which(names(dat3)=='r.ic1.ic2')

# NOTE(review): p.value.ados is allocated but never filled in this loop --
# confirm whether it is leftover or populated elsewhere.
p.value.ados = rep(NA,nEdges)
cor.ados=rep(NA,nEdges)
for (edgeidx in 1:nEdges) {
  dat3.edgeidx = startEdgeidx+edgeidx-1 
  # Pearson correlation between the adjusted edge and ADOS among usable scans:
  cor.ados[edgeidx]=cor(dat3[idx.pass.cc,dat3.edgeidx],dat3$ADOS.Comparable.Total[idx.pass.cc])
  # NOTE(review): bare gam() here -- presumably gam::gam (attached via
  # SuperLearner), unlike the explicit mgcv::gam used earlier; confirm the
  # intended package, since both define gam() with s() smooths.
  model.gam.out = gam(dat3[idx.pass.cc,dat3.edgeidx]~s(ADOS.Comparable.Total)+PrimaryDiagnosis,data=dat3[idx.pass.cc,])
  plot(model.gam.out,terms = 's(ADOS.Comparable.Total)')
}

# Range of edge-ADOS correlations across all signal edges:
max(cor.ados)
## [1] 0.1698128
min(cor.ados)
## [1] -0.2142168
## <---------------------
# note on power:
# the wilcoxon power does not vary with the numbers in each group,
# which is a bummer.
# here is an example of how the group size should matter:
library(pwr)

# Unbalanced design (20 vs 80 per group), medium effect d = 0.5:
pwr.t2n.test(n1 = 20, n2 = 80, d = 0.5, sig.level = 0.05,
             power = NULL, alternative = "two.sided")
## 
##      t test power calculation 
## 
##              n1 = 20
##              n2 = 80
##               d = 0.5
##       sig.level = 0.05
##           power = 0.5081857
##     alternative = two.sided
# Balanced design with the same total n (50 vs 50) has higher power:
pwr.t2n.test(n1 = 50, n2 = 50, d = 0.5, sig.level = 0.05,
             power = NULL, alternative = "two.sided")
## 
##      t test power calculation 
## 
##              n1 = 50
##              n2 = 50
##               d = 0.5
##       sig.level = 0.05
##           power = 0.6968934
##     alternative = two.sided